package cn.mzcode.recheck.util;

import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POIXMLException;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * @author: 马壮
 * @create: 2020-04-28 23:53
 * @description: 文件工具类
 */
@Slf4j
public class FileUtil {

    /**
     * 迭代删除文件夹
     *
     * @param dirPath 文件夹路径
     */
    public static void deleteDir(String dirPath) {
        File file = new File(dirPath);
        if (!file.exists()) {
            return;
        }
        if (!file.isFile()) {
            File[] files = file.listFiles();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    deleteDir(files[i].getAbsolutePath());
                }
            }
        }
        file.delete();
    }

    /**
     * 读取指定pdf文档
     *
     * @param filePath 文件路径
     * @return pdf文本
     */
    public static String readPdf(String filePath) {
        String content = "";
        try (PDDocument doc = PDDocument.load(new File(filePath))) {
            PDFTextStripper textStripper = new PDFTextStripper();
            content = textStripper.getText(doc);
        } catch (Exception e) {
            log.error("[FileUtil]读取pdf文档出现错误,filePath={}", filePath, e);
        }
        return content.replaceAll("( |\r|\n|\t)", "");
    }

    /**
     * 读取指定Word文档
     *
     * @param filePath 文件路径
     * @return 文档中的纯文本
     */
    public static String readWord(String filePath) {
        String text = "";
        try (InputStream is = new FileInputStream(filePath)) {
            try {
                XWPFDocument doc = new XWPFDocument(is);
                XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
                text = extractor.getText();
            } catch (POIXMLException e) {
                InputStream is1 = new FileInputStream(filePath);
                WordExtractor extractor = new WordExtractor(is1);
                text = extractor.getText();
            }
        } catch (Exception e) {
            log.error("[FileUtil]读取Word文档出现错误,filePath={}", filePath, e);
        }
        return text.toString().replaceAll("( |\r|\n|\t)", "");
    }

    /**
     * 读取指定txt文档
     *
     * @param path 文件路径
     * @return txt文本
     */
    public static String readTxt(String path) {
        StringBuilder content = new StringBuilder("");
        try {
            String code = resolveCode(path);
            File file = new File(path);
            InputStream is = new FileInputStream(file);
            InputStreamReader isr = new InputStreamReader(is, code);
            BufferedReader br = new BufferedReader(isr);
            String str = "";
            while (null != (str = br.readLine())) {
                content.append(str);
            }
            br.close();
        } catch (Exception e) {
            log.error("[FileUtil]读取txt失败,path={}", path, e);
        }
        return content.toString().replaceAll("( |\r|\n|\t)", "");
    }

    /**
     * 获取文件编码
     *
     * @param sourceFile 文件全路径
     * @return
     * @throws Exception
     */
    public static String resolveCode(String sourceFile) throws Exception {
        String charset = "GBK";
        byte[] first3Bytes = new byte[3];
        try {
            boolean checked = false;
            BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile));
            bis.mark(0);
            int read = bis.read(first3Bytes, 0, 3);
            if (read == -1) {
                //文件编码为 ANSI
                return charset;
            } else if (first3Bytes[0] == (byte) 0xFF
                    && first3Bytes[1] == (byte) 0xFE) {
                //文件编码为 Unicode
                charset = "UTF-16LE";
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xFE
                    && first3Bytes[1] == (byte) 0xFF) {
                //文件编码为 Unicode big endian
                charset = "UTF-16BE";
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xEF
                    && first3Bytes[1] == (byte) 0xBB
                    && first3Bytes[2] == (byte) 0xBF) {
                //文件编码为 UTF-8
                charset = "UTF-8";
                checked = true;
            }
            bis.reset();
            if (!checked) {
                int loc = 0;
                while ((read = bis.read()) != -1) {
                    loc++;
                    if (read >= 0xF0) {
                        break;
                    }
                    // 单独出现BF以下的，也算是GBK
                    if (0x80 <= read && read <= 0xBF) {
                        break;
                    }
                    if (0xC0 <= read && read <= 0xDF) {
                        read = bis.read();
                        // 双字节 (0xC0 - 0xDF)
                        if (!(0x80 <= read && read <= 0xBF)) {
                            break;
                        }
                        // 也有可能出错，但是几率较小
                    } else if (0xE0 <= read) {
                        read = bis.read();
                        if (0x80 <= read && read <= 0xBF) {
                            read = bis.read();
                            if (0x80 <= read && read <= 0xBF) {
                                charset = "UTF-8";
                                break;
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    }
                }
            }
            bis.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return charset;
    }

    /**
     * 读取txt文本，读取后删除文件
     *
     * @param filePath 文件路径
     * @return
     */
    public static String readTxtWithDelete(String filePath) {
        StringBuffer result = new StringBuffer();
        File file = new File(filePath);
        try (BufferedReader input = new BufferedReader(new FileReader(file))) {
            String strLine = "";
            while ((strLine = input.readLine()) != null) {
                result.append(strLine);
            }
        } catch (Exception e) {
            log.error("[FileUtil]读取TXT文档出现错误", e);
        } finally {
            file.delete();
        }
        return result.toString().replaceAll("( |\r|\n|\t)", "");
    }

    /**
     * 写入TXT，追加写入
     *
     * @param filePath 文件路径
     * @param content  字符串
     */
    public static void appendToTxt(String filePath, String content) {
        File file = new File(filePath);
        try (FileOutputStream fos = new FileOutputStream(file, true)) {
            //指定以UTF-8格式写入文件
            OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
            osw.write(content);
            osw.close();
        } catch (IOException e) {
            log.error("[FileUtil]文件写入失败！" + e);
        }
    }

    /**
     * 创建文件
     *
     * @param fileName
     * @return
     */
    public static boolean createFile(String fileName) throws Exception {
        try {
            File file = new File(fileName);
            if (!file.exists()) {
                file.createNewFile();
            }
        } catch (Exception e) {
            log.error("[FileUtil]文件创建失败！" + e);
        }
        return true;
    }

    /**
     * 获取指定路径内所有文件名
     *
     * @param path 路径名
     * @return
     * @throws Exception
     */
    public static List<String> getFilesPath(String path) throws Exception {
        //目标集合fileList
        List<String> fileList = new ArrayList<>();
        File file = new File(path);
        if (file.isDirectory()) {
            File[] files = file.listFiles();
            for (File f : files) {
                if (f.isFile() && !f.getName().startsWith("~$")) {
                    fileList.add(f.getAbsolutePath());
                }
            }
        }
        return fileList;
    }

    /**
     * 读取文件内容，支持txt、pdf、doc、docx
     *
     * @param path 文件路径
     * @return 内容文本
     */
    public static String read(String path) {
        if (path.endsWith(".txt")) {
            return readTxt(path);
        } else if (path.endsWith(".pdf")) {
            return readPdf(path);
        } else if (path.endsWith(".doc") || path.endsWith(".docx")) {
            return readWord(path);
        } else {
            return null;
        }
    }
}
